1 Data

Consider the data from the example used in Statistical Analysis and in Machine Learning and Predictive Analytics. Compare the results with those from the workshop.

# Swiss fertility and socio-economic data (47 French-speaking provinces,
# 1888), shipped with base R.
myData <- datasets::swiss

2 Questions

Data list

# Bundle the data for Stan/JAGS: the response is Fertility and every
# other column is a predictor.
x <- myData[, setdiff(names(myData), "Fertility"), drop = FALSE]
data_list <- list(
  Ntotal = nrow(x),      # number of observations
  Nx = ncol(x),          # number of predictors
  x = as.matrix(x),      # predictor matrix
  y = myData$Fertility   # response vector
)

2.1 Fit robust linear regression for response Fertility using all available predictors without shrinkage

data {
    int<lower=1> Ntotal;           // number of observations
    int<lower=1> Nx;               // number of predictors
    vector[Ntotal] y;              // response (Fertility)
    matrix[Ntotal, Nx] x;          // predictor matrix
}
transformed data {
    // Prior constants derived from the scale of the data.
    real expLambda;                // rate of the exponential prior on nu
    real mean_y;
    real sd_y;
    real unifLo;                   // lower bound of the uniform prior on sigma
    real unifHi;                   // upper bound of the uniform prior on sigma
    expLambda = 1 / 30.0;          // prior mean of nu is 30
    mean_y = mean(y);
    sd_y = sd(y);
    unifLo = sd_y / 1000;
    unifHi = sd_y * 1000;
}
parameters {
    real<lower=0> nu;              // Student-t degrees of freedom
    real beta0;                    // intercept (implicit flat prior)
    vector[Nx] xbeta;              // slopes (implicit flat priors)
    // FIX: declare sigma with bounds that match its uniform prior below.
    // A bare <lower=0> lets the sampler propose values outside the prior
    // support, causing rejected proposals and sampler warnings.
    real<lower=unifLo, upper=unifHi> sigma;
}
transformed parameters{
    vector[Ntotal] y_hat;          // linear predictor
    y_hat = beta0 + x * xbeta;
}
model {
  nu ~ exponential(expLambda);
  sigma ~ uniform(unifLo, unifHi);
  y ~ student_t(nu, y_hat, sigma);  // robust (heavy-tailed) likelihood
}
# Sample the posterior of the no-shrinkage robust regression model:
# 4 chains in parallel; deeper trees allowed because the intercept and
# slopes are correlated.
fit_robust_all_preds <- sampling(
  model_robust_all_preds,
  data = data_list,
  pars = c("beta0", "xbeta", "sigma", "nu"),
  iter = 2500,
  chains = 4,
  cores = 4,
  control = list(max_treedepth = 15)
)

The posterior of \(\nu\) concentrates at high values, so the Student-t likelihood is close to Gaussian and the robustness is probably not being used.

# Posterior of nu: mass at large values indicates near-normal tails.
plot(fit_robust_all_preds, pars = "nu")

Analyze correlations between slope parameters and interpret them

The most notable correlations:

  • \(\beta_0\) and \(\beta_5\) have a strong negative relationship

  • \(\beta_0\) and \(\beta_1\) also have a noticeably strong negative relationship

  • There are several others with correlations higher than zero, but none to be alarmed about (no narrow tunnel)

# Pairwise scatter plots of posterior draws to inspect correlations.
pairs(fit_robust_all_preds, pars = c("beta0", "xbeta","sigma", "nu"))

Traces look good and no auto-correlation issues:

# Trace plots per chain: check mixing and stationarity.
stan_trace(fit_robust_all_preds, pars = c("beta0", "xbeta","sigma", "nu"))

# Autocorrelation by lag for each parameter.
stan_ac(fit_robust_all_preds, pars = c("beta0", "xbeta","sigma", "nu"))

We have smooth densities with no shrinkage:

# Marginal posterior densities for all monitored parameters.
stan_dens(fit_robust_all_preds, pars = c("beta0", "xbeta","sigma", "nu"))

Which slope parameters are not significant based on 95%-HDI?

Only \(\beta_2\) is not significant (its 95%-HDI contains zero); \(\beta_1,\beta_3,\beta_4,\beta_5\) are significant. Note that \(\beta_1\) and \(\beta_4\) are close to zero, but still significant based on the 95% HDI.

# Caterpillar plot of the slope posteriors with credible intervals.
plot(fit_robust_all_preds, pars = "xbeta")

# 95% HDIs for the intercept and slopes.
betas <- rstan::extract(fit_robust_all_preds, pars = c("beta0", "xbeta"))
hdi(betas$beta0)
##    lower    upper 
## 46.06266 88.22377 
## attr(,"credMass")
## [1] 0.95
# One HDI per slope column (xbeta[1..5]).
hdi(betas$xbeta)
##        
##                [,1]       [,2]       [,3]       [,4]      [,5]
##   lower -0.31913810 -0.7861737 -1.2391002 0.03280222 0.3214482
##   upper -0.03401911  0.2335651 -0.4687047 0.17688555 1.8540699
## attr(,"credMass")
## [1] 0.95

Does the model satisfy Gaussian assumption?

\(\nu\) is high but the HDI is very wide (and includes values below 10). Therefore, it is difficult to conclude that the Gaussian assumption is satisfied given the uncertainty of \(\nu\).

# 95% HDI of nu; extract() returns a list, so hdi() labels the result $nu.
nu <- rstan::extract(fit_robust_all_preds, pars = "nu")
hdi(nu)
## $nu
##     lower     upper 
##  2.097622 95.926288 
## attr(,"credMass")
## [1] 0.95

2.2 Fit robust linear regression for response Fertility using all available predictors with shrinkage

data {
    int<lower=1> Ntotal;            // number of observations
    int<lower=1> Nx;                // number of predictors
    vector[Ntotal] y;               // response (Fertility)
    matrix[Ntotal, Nx] x;           // predictor matrix
}
transformed data {
    real expLambda;                 // rate of the exponential prior on nu
    real meanY;
    real sdY;
    real unifLo;
    real unifHi;
    vector[Ntotal] zy; // normalized response
    vector[Nx] meanX;
    vector[Nx] sdX;
    matrix[Ntotal, Nx] zx; // normalized predictors
    
    expLambda = 1 / 30.0;           // prior mean of nu is 30
    meanY = mean(y);
    sdY = sd(y);
    unifLo = sdY / 1000;
    unifHi = sdY * 1000;
    zy = (y - meanY) / sdY;
    
    // Standardize each predictor column.
    for (j in 1:Nx) {
        meanX[j] = mean(x[, j]);
        sdX[j] = sd(x[, j]);
        for (i in 1:Ntotal) {
            zx[i, j] = (x[i, j] - meanX[j]) / sdX[j];
        }
    }
}
parameters {
    real zbeta0;                    // standardized intercept
    real<lower=0> sigmaBeta;        // shared scale of the shrinkage prior
    vector[Nx] zbeta;               // standardized slopes
    real<lower=0> nu;
    // FIX: bounds match the uniform prior below, avoiding rejections.
    real<lower=unifLo, upper=unifHi> zsigma;
}
transformed parameters{
    vector[Ntotal] zy_hat;          // linear predictor, standardized scale
    zy_hat = zbeta0 + zx * zbeta;
}
model {
    zbeta0 ~ normal(0, 2);
    sigmaBeta ~ gamma(2.3, 1.3);    // mode 1.0, sd ~1.17 (not "mode 0, sd 0.5")
    // Shrinkage prior: t with df = 1 (Cauchy), matching the JAGS workshop
    // model dt(0, 1/sigmaBeta^2, 1). Heavy tails let strong slopes escape
    // while a small sigmaBeta pulls weak slopes toward zero.
    // FIX: was student_t(expLambda, ...) — df = 1/30 is surely unintended.
    zbeta  ~ student_t(1, 0, sigmaBeta);
    nu ~ exponential(expLambda);
    zsigma ~ uniform(unifLo, unifHi);
    zy ~ student_t(1 + nu, zy_hat, zsigma);  // the shift keeps df > 1
}
generated quantities { 
    // Transform parameters back to the original scale:
    real beta0; 
    vector[Nx] beta;
    real sigma;
    // .* and ./ are element-wise product and divide
    beta0 = zbeta0 * sdY  + meanY - sdY * sum(zbeta .* meanX ./ sdX);
    beta = sdY * (zbeta ./ sdX);
    sigma = zsigma * sdY;
}
# Fit the shrinkage model. More iterations than before because the
# hierarchical prior on the slopes mixes more slowly.
fit_robust_all_preds_shrinkage <- sampling(
  model_robust_all_preds_shrinkage,
  data = data_list,
  pars = c("beta0", "beta", "sigma", "nu"),
  iter = 5000,
  chains = 4,
  cores = 4
)

Shrinkage is not strong:

# Credible intervals of the shrinkage-fit slopes.
plot(fit_robust_all_preds_shrinkage, pars = "beta")

# FIX: originally plotted the no-shrinkage fit (fit_robust_all_preds with
# pars "xbeta"); this section discusses the shrinkage fit, whose slope
# parameters are named "beta".
stan_dens(fit_robust_all_preds_shrinkage, pars = c("beta0", "beta", "sigma", "nu"))

Modify parameters of prior distribution for \(\sigma_\beta\) from values in the workshop in order to make shrinkage stronger

If we lower the shape parameter of the gamma prior below 1, the distribution's mode moves to zero, which pulls \(\sigma_\beta\) toward zero and therefore strengthens the shrinkage of the slopes.

# Compare the new, stronger-shrinkage prior with the workshop prior.
# FIX: dgamma(x, shape, rate) — the model used gamma(shape = 2.3,
# rate = 1.3), so the original call dgamma(seq..., 1.3, 2.3) had the
# arguments swapped; the label is corrected to match.
data.frame(new_gamma = dgamma(seq(0, 1, .01), .8, 1.5),
           original_gamma = dgamma(seq(0, 1, .01), 2.3, 1.3)) %>%
  gather(key = "parameter", value = value) %>% 
  ggplot(aes(value, fill = parameter)) +
  geom_density(alpha = .75, color = "#d9d9d9") +
  scale_fill_viridis_d(name = NULL, labels = c(expression(dgamma(.8, 1.5)), expression(dgamma(2.3, 1.3)))) +
  labs(title = latex2exp::TeX("New $\\gamma$ Encourages Movement to Zero"),
       x = NULL, y = "Density") +
  theme(legend.position = "top")

data {
    int<lower=1> Ntotal;            // number of observations
    int<lower=1> Nx;                // number of predictors
    vector[Ntotal] y;               // response (Fertility)
    matrix[Ntotal, Nx] x;           // predictor matrix
}
transformed data {
    real expLambda;                 // rate of the exponential prior on nu
    real meanY;
    real sdY;
    real unifLo;
    real unifHi;
    vector[Ntotal] zy; // normalized response
    vector[Nx] meanX;
    vector[Nx] sdX;
    matrix[Ntotal, Nx] zx; // normalized predictors
    
    expLambda = 1 / 30.0;           // prior mean of nu is 30
    meanY = mean(y);
    sdY = sd(y);
    unifLo = sdY / 1000;
    unifHi = sdY * 1000;
    zy = (y - meanY) / sdY;
    
    // Standardize each predictor column.
    for (j in 1:Nx) {
        meanX[j] = mean(x[, j]);
        sdX[j] = sd(x[, j]);
        for (i in 1:Ntotal) {
            zx[i, j] = (x[i, j] - meanX[j]) / sdX[j];
        }
    }
}
parameters {
    real zbeta0;                    // standardized intercept
    real<lower=0> sigmaBeta;        // shared scale of the shrinkage prior
    vector[Nx] zbeta;               // standardized slopes
    real<lower=0> nu;
    // FIX: bounds match the uniform prior below, avoiding rejections.
    real<lower=unifLo, upper=unifHi> zsigma;
}
transformed parameters{
    vector[Ntotal] zy_hat;          // linear predictor, standardized scale
    zy_hat = zbeta0 + zx * zbeta;
}
model {
    zbeta0 ~ normal(0, 2);
    // Shape < 1 puts the mode at 0 (sd ~0.6): stronger pull of sigmaBeta
    // toward zero than the workshop's gamma(2.3, 1.3).
    sigmaBeta ~ gamma(.8, 1.5);
    // Shrinkage prior: t with df = 1 (Cauchy), matching the JAGS workshop
    // model dt(0, 1/sigmaBeta^2, 1).
    // FIX: was student_t(expLambda, ...) — df = 1/30 is surely unintended.
    zbeta  ~ student_t(1, 0, sigmaBeta);
    nu ~ exponential(expLambda);
    zsigma ~ uniform(unifLo, unifHi);
    zy ~ student_t(1 + nu, zy_hat, zsigma);  // the shift keeps df > 1
}
generated quantities { 
    // Transform parameters back to the original scale:
    real beta0; 
    vector[Nx] beta;
    real sigma;
    // .* and ./ are element-wise product and divide
    beta0 = zbeta0 * sdY  + meanY - sdY * sum(zbeta .* meanX ./ sdX);
    beta = sdY * (zbeta ./ sdX);
    sigma = zsigma * sdY;
}
# Fit the stronger-shrinkage model. Thinning plus a higher adapt_delta
# and deeper trees address the autocorrelation seen with shrinkage.
fit_robust_all_preds_shrinkage_more <- sampling(
  model_robust_all_preds_shrinkage_more,
  data = data_list,
  pars = c("beta0", "beta", "sigma", "nu"),
  iter = 5000,
  warmup = 2500,
  thin = 5,
  chains = 4,
  cores = 4,
  control = list(max_treedepth = 20, adapt_delta = .93)
)

Which parameters shrunk to become insignificant in model with shrinkage?

\(\beta_1\) and \(\beta_2\) had the most shrinkage. \(\beta_5\) became insignificant:

# Posterior densities after stronger shrinkage.
stan_dens(fit_robust_all_preds_shrinkage_more, pars = c("beta0", "beta","sigma", "nu"))

  • \(\beta_1\), \(\beta_2\), \(\beta_4\), and \(\beta_5\) HDI now include zero.

  • \(\beta_4\) and \(\beta_5\) barely include zero.

# Credible intervals of the slopes under the stronger shrinkage prior.
plot(fit_robust_all_preds_shrinkage_more, pars = "beta")

# HDIs under the stronger shrinkage prior.
hdi(rstan::extract(fit_robust_all_preds_shrinkage_more, pars = "beta"))
## $beta
##        
##                [,1]       [,2]       [,3]          [,4]         [,5]
##   lower -0.27202616 -0.7479470 -1.2104803 -0.0006538085 -0.009799926
##   upper  0.02389413  0.1811659 -0.4045162  0.1621068403  1.815403081
## attr(,"credMass")
## [1] 0.95
# HDIs under the original shrinkage prior, for comparison.
hdi(rstan::extract(fit_robust_all_preds_shrinkage, pars = "beta"))
## $beta
##        
##                [,1]       [,2]       [,3]       [,4]      [,5]
##   lower -0.27595542 -0.7734357 -1.2029135 0.01667599 0.2200677
##   upper  0.01099398  0.1912987 -0.4261835 0.16629623 1.8840757
## attr(,"credMass")
## [1] 0.95
# HDIs without shrinkage, for comparison.
hdi(rstan::extract(fit_robust_all_preds, pars = "xbeta"))
## $xbeta
##        
##                [,1]       [,2]       [,3]       [,4]      [,5]
##   lower -0.31913810 -0.7861737 -1.2391002 0.03280222 0.3214482
##   upper -0.03401911  0.2335651 -0.4687047 0.17688555 1.8540699
## attr(,"credMass")
## [1] 0.95

When introducing shrinkage, there were auto-correlation problems, which were fixed by adjusting some sampler parameters.

# Autocorrelation after thinning and tuning — should be near zero.
stan_ac(fit_robust_all_preds_shrinkage_more)

2.3 Run model selection like in Workshop 2

# JAGS model for stochastic variable selection: each slope gets an
# inclusion indicator delta[j] ~ dbern(0.5); posterior frequencies of
# the delta patterns rank the sub-models.
# FIX: assignment changed from `=` to `<-` per R style; the model string
# itself is unchanged.
modelString <- "
    # Standardize the data:
    data {
        ym <- mean(y)
        ysd <- sd(y)
        for ( i in 1:Ntotal ) {
            zy[i] <- ( y[i] - ym ) / ysd
        }
        for ( j in 1:Nx ) {
            xm[j]  <- mean(x[,j])
            xsd[j] <-   sd(x[,j])
            for ( i in 1:Ntotal ) {
                zx[i,j] <- ( x[i,j] - xm[j] ) / xsd[j]
            }
        }
    }
    # Specify the model for standardized data:
    model {
        for ( i in 1:Ntotal ) {
            zy[i] ~ dt( zbeta0 + sum( delta[1:Nx] * zbeta[1:Nx] * zx[i,1:Nx] ) ,  1/zsigma^2 , nu )
        }
        # Priors vague on standardized scale:
        zbeta0 ~ dnorm( 0 , 1/2^2 )
        for ( j in 1:Nx ) {
            zbeta[j] ~ dt( 0 , 1/sigmaBeta^2 , 1 ) 
            delta[j] ~ dbern( 0.5 )
        }
        zsigma ~ dunif( 1.0E-5 , 1.0E+1 )
        ## Uncomment one of the following specifications for sigmaBeta:
        # sigmaBeta <- 2.0
        # sigmaBeta ~ dunif( 1.0E-5 , 1.0E+2 )
        sigmaBeta ~ dgamma(1.1051,0.1051) # mode 1.0, sd 10.0
        # sigmaBeta <- 1/sqrt(tauBeta) ; tauBeta ~ dgamma(0.001,0.001) 
        nu ~ dexp(1/30.0)
        # Transform to original scale:
        beta[1:Nx] <- ( delta[1:Nx] * zbeta[1:Nx] / xsd[1:Nx] )*ysd
        beta0 <- zbeta0*ysd  + ym - sum( delta[1:Nx] * zbeta[1:Nx] * xm[1:Nx] / xsd[1:Nx] )*ysd
        sigma <- zsigma*ysd
    }
"
# Variables to monitor in JAGS and the MCMC settings.
parameters <- c(
  "beta0", "beta", "sigma", "delta", "sigmaBeta",
  "zbeta0", "zbeta", "zsigma", "nu"
)

adaptSteps <- 500       # adaptation iterations
burnInSteps <- 1000     # burn-in iterations
numSavedSteps <- 15000  # total saved samples across all chains
thinSteps <- 25         # heavy thinning to combat autocorrelation
nChains <- 3

# Run the chains in parallel; switch to "rjags" on a single-core CPU.
runjagsMethod <- "parallel"

runJagsOut <- run.jags(
  method = runjagsMethod,
  model = modelString,
  monitor = parameters,
  data = data_list,
  n.chains = nChains,
  adapt = adaptSteps,
  burnin = burnInSteps,
  sample = ceiling(numSavedSteps / nChains),  # saved samples per chain
  thin = thinSteps,
  summarise = FALSE,
  plots = FALSE
)

Diagnostics:

# plot all params
plot(runJagsOut,
     plot.type = c("trace", "ecdf", "histogram", "autocorr")
     )
## Generating summary statistics and plots (these will NOT be saved
## for reuse)...
## Calculating summary statistics...
## Calculating the Gelman-Rubin statistic for 21 variables....

Order predictors using their inclusion probabilities as a measure of relative importance

# Extract the inclusion-indicator draws. FIX: select the delta columns
# by name instead of the hard-coded positions 8:12, so the code survives
# any change to the monitor list.
mcmcMat <- as.matrix(runJagsOut$mcmc)
trajectoriesDelta <- mcmcMat[, grep("^delta\\[", colnames(mcmcMat)), drop = FALSE]
Nchain <- nrow(trajectoriesDelta)  # NOTE: number of saved draws, not chains
trajectoriesDelta %>% head()
##      delta[1] delta[2] delta[3] delta[4] delta[5]
## [1,]        1        0        1        1        0
## [2,]        1        0        1        1        0
## [3,]        1        1        1        1        0
## [4,]        0        0        1        1        1
## [5,]        0        0        1        1        1
## [6,]        1        1        1        1        1

Inclusion Probabilities:

# Inclusion probability of each predictor: the share of saved draws in
# which its delta indicator equals one.
apply(trajectoriesDelta, 2, function(d) sum(d == 1) / Nchain) %>%
  set_names(paste0("Delta ", dimnames(data_list$x)[[2]])) %>%
  bind_rows() %>%
  gather(key = "delta", value = value) %>%
  ggplot(aes(fct_rev(fct_reorder(delta, value)), value)) +
  geom_col(fill = "skyblue") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Variable Importance MCMC",
       x = "Delta",
       y = "MCMC Delta Proportion")

Compare relevance of the following sub-models based on their observed frequencies:

  1. Fertility ~ Agriculture + Examination

  2. Fertility ~ Education + Catholic + Infant.Mortality

  3. Fertility ~ Agriculture + Education + Catholic + Infant.Mortality

  4. Fertility ~ .

Fertility ~ Agriculture + Education + Catholic + Infant.Mortality was the most relevant with the highest frequency proportion of ~32%.

# Candidate sub-models encoded as delta patterns, in predictor order:
# (Agriculture, Examination, Education, Catholic, Infant.Mortality).
delta_checks <- list("Agriculture + Examination" = c(1, 1, 0, 0, 0),
                     "Education + Catholic + Infant.Mortality" = c(0, 0, 1, 1, 1),
                     "Agriculture + Education + Catholic + Infant.Mortality" = c(1, 0, 1, 1, 1),
                     "All predictors" = c(1, 1, 1, 1, 1))


# Frequency of each exact delta pattern among the saved draws:
# prod(z == pattern) is 1 only when every indicator matches.
map(delta_checks, function(pattern) {
  sum(apply(trajectoriesDelta, 1, function(z) prod(z == pattern)) / Nchain)
}) %>%
  set_names(paste0("Delta ", names(delta_checks))) %>%
  bind_rows() %>%
  gather(key = "delta", value = value) %>%
  ggplot(aes(fct_reorder(delta, value), value)) +
  geom_col(fill = "skyblue") +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  labs(title = "MCMC Sub-model comparison",
       x = "Delta",
       y = "MCMC Delta Frequency Proportion")